---
title: Analyze
keywords: fastai
sidebar: home_sidebar
summary: "This Notebook analyzes various concepts for anomaly detection"
description: "This Notebook analyzes various concepts for anomaly detection"
nb_path: "03_analyze.ipynb"
---
from anomaly.utils import *
from anomaly.binet import *
import pandas as pd
def _shift_columns(a, ws=3):
    """Stack `ws` rolled copies of `a` so that row j holds a[j-ws+1 .. j] (wrapping)."""
    return np.dstack(list(reversed([np.roll(a, i) for i in range(0, ws)])))[0]


def subsequences_fast(df, event_ids, ws=None, min_ws=64):
    """Build a left-zero-padded prefix subsequence for every event.

    Parameters
    ----------
    df : DataFrame of encoded event attributes (one column per attribute).
    event_ids : array of within-trace positions; 0 marks the start of a trace.
    ws : window size; defaults to longest trace length - 1.  Must be at least
        that value when given explicitly.
    min_ws : lower bound applied to `ws` after validation.

    Returns
    -------
    (res, targets) : `res` has shape (len(df)-1, n_cols, ws); row k is the
        trace prefix up to event k, right-aligned and zero-padded on the left.
        `targets` holds the index of the event each prefix should predict.

    Raises
    ------
    ValueError : if an explicit `ws` is smaller than the longest trace - 1.
    """
    max_trace_len = int(event_ids.max()) + 1
    if not ws:
        ws = max_trace_len - 1
    elif ws < max_trace_len - 1:
        raise ValueError(f"ws must be greater equal {max_trace_len-1}")
    # BUG FIX: the original assigned `pad = ws` here but never used it; the
    # dead local has been removed.
    ws = max(min_ws, ws)
    trace_start = np.where(event_ids == 0)[0]
    trace_len = np.array([trace_start[i] - trace_start[i-1] for i in range(1, len(trace_start))] + [len(df) - trace_start[-1]])
    # tmp: (n_cols, n_events, ws) -- one shifted-window matrix per column
    tmp = np.stack([_shift_columns(df[i], ws=ws) for i in list(df)])
    idx = [range(trace_start[i], trace_start[i] + trace_len[i]) for i in range(len(trace_start))]
    idx = np.array([y for x in idx for y in x])
    res = np.rollaxis(tmp, 1)[idx]
    # np.roll wraps around, so zero out the window positions that lie before
    # the start of each event's trace (positions < ws-1-event_id).
    mask = ws - 1 - event_ids[idx][:, None] > np.arange(res.shape[2])
    res[np.broadcast_to(mask[:, None], res.shape)] = 0
    # Drop the last prefix: there is no following event left to predict.
    return res[:-1], (idx + 1)[:-1]
def _shift_columns(a, ws=3):
    """Stack `ws` rolled copies of `a` so that row j holds a[j-ws+1 .. j] (wrapping).

    NOTE(review): duplicate of the `_shift_columns` defined earlier in this
    file (notebook export artifact); kept so this cell stays self-contained.
    """
    return np.dstack(list(reversed([np.roll(a, i) for i in range(0, ws)])))[0]


def windows_fast(df, event_ids, ws=5, pad=None):
    """Build a fixed-size sliding window for every non-initial event.

    Each trace is prefixed with `ws-1` zero "padding" events; then for every
    event except the first of its trace, the window of `ws` entries ending at
    its predecessor is emitted (i.e. the context used to predict that event).

    Parameters
    ----------
    df : DataFrame of encoded event attributes (one column per attribute).
    event_ids : array of within-trace positions; 0 marks the start of a trace.
    ws : window size.
    pad : if truthy, left-pad the window axis up to this length
        (must be >= ws, otherwise np.pad raises on the negative width).

    Returns
    -------
    (res, targets) : `res` has shape (n_events - n_traces, n_cols, ws or pad);
        `targets` are the indices of the non-initial events each window predicts.
    """
    # BUG FIX: removed the unused local `max_trace_len` computed by the original.
    trace_start = np.where(event_ids == 0)[0]
    trace_len = [trace_start[i] - trace_start[i-1] for i in range(1, len(trace_start))] + [len(df) - trace_start[-1]]
    # After inserting ws-1 zeros before each trace, trace i is shifted right by
    # i*(ws-1); its first full window ends at new_start + (ws-1).
    idx = [range(trace_start[i] + (i+1)*(ws-1), trace_start[i] + trace_len[i] + (i+1)*(ws-1) - 1) for i in range(len(trace_start))]
    idx = np.array([y for x in idx for y in x])
    trace_start = np.repeat(trace_start, ws-1)
    # Insert ws-1 zero events at every trace start, then window each column.
    tmp = np.stack([_shift_columns(np.insert(np.array(df[i]), trace_start, 0, axis=0), ws=ws) for i in list(df)])
    tmp = np.rollaxis(tmp, 1)
    res = tmp[idx]
    if pad:
        res = np.pad(res, ((0, 0), (0, 0), (pad-ws, 0)))
    return res, np.where(event_ids != 0)[0]
class TestModel(nn.Module):
    """Simple unrolled RNN next-activity classifier with a 2-d bottleneck.

    The 2-unit bottleneck before the output layer makes the hidden state
    plottable in 2-D for the anomaly analysis further below.
    """

    def __init__(self, pp_data, is_cuda=False, vocab_col='activity'):
        super().__init__()
        vocab_size = len(pp_data.procs.categorify[vocab_col])
        # Position of `vocab_col` within the categorical-feature axis of xb.
        self.vocab_index = {s: i for i, s in enumerate(pp_data.cat_names[0])}[vocab_col]
        # Embedding and hidden sizes scale with sqrt of the vocabulary size.
        n_fac, n_hidden = round(sqrt(vocab_size)) + 1, round(sqrt(vocab_size) * 2)
        self.n_hidden = n_hidden
        self.is_cuda = is_cuda
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_bottleneck = nn.Linear(n_hidden, 2)
        self.l_out = nn.Linear(2, vocab_size)

    def forward(self, xb):
        # assumes xb is (batch, n_cat_features, window) -- TODO confirm against dls
        cs = xb.permute(1, 2, 0)[self.vocab_index]   # -> (window, batch)
        bs = len(cs[0])
        h = torch.zeros((bs, self.n_hidden))
        if self.is_cuda:
            h = h.cuda()
        # Unrolled simple RNN over the window positions.
        for c in cs:
            inp = torch.relu(self.l_in(self.e(c)))
            h = torch.tanh(self.l_hidden(h + inp))
        h = self.l_bottleneck(h)
        # BUG FIX: log_softmax must normalize over the vocabulary axis (dim=1);
        # the original used dim=0, which normalizes across the batch.
        return F.log_softmax(self.l_out(h), dim=1)
class Camargo_specialized_bottleneck(torch.nn.Module):
    """Camargo-style LSTM next-activity model with a 2-d bottleneck.

    Embeds the activity ids, runs a 2-layer LSTM, squeezes the last hidden
    state through a 2-unit bottleneck, and projects to activity logits.
    """

    def __init__(self, o):
        super().__init__()
        hidden = 25
        vocab_act = len(o.procs.categorify['activity'])
        emb_dim_act = int(sqrt(vocab_act)) + 1
        self.emb_act = nn.Embedding(vocab_act, emb_dim_act)
        self.lstm_act = nn.LSTM(emb_dim_act, hidden, batch_first=True, num_layers=2)
        self.l_bottleneck = nn.Linear(hidden, 2)
        self.linear_act = nn.Linear(2, vocab_act)

    def forward(self, xcat):
        embedded = self.emb_act(xcat)
        seq_out, _ = self.lstm_act(embedded)
        last_state = seq_out[:, -1]        # hidden state after the final step
        squeezed = self.l_bottleneck(last_state)
        return self.linear_act(squeezed)
class Camargo_specialized(torch.nn.Module):
    """Camargo-style LSTM next-activity classifier (activity feature only)."""

    def __init__(self, o):
        super().__init__()
        hidden = 25
        vocab_act = len(o.procs.categorify['activity'])
        emb_dim_act = int(sqrt(vocab_act)) + 1
        self.emb_act = nn.Embedding(vocab_act, emb_dim_act)
        self.lstm_act = nn.LSTM(emb_dim_act, hidden, batch_first=True, num_layers=2)
        self.linear_act = nn.Linear(hidden, vocab_act)

    def forward(self, xcat):
        embedded = self.emb_act(xcat)
        seq_out, _ = self.lstm_act(embedded)
        final_state = seq_out[:, -1]       # hidden state after the final step
        # Deliberately return raw logits: a softmax here was found to be very
        # bad for anomaly detection tasks and in general for tasks where
        # multiple outcomes are reasonable.
        return self.linear_act(final_state)
# --- Train a bottleneck model on the PDC2020 event log and score it --------
# Bare expressions below (`wds`, `event_df`, `res.shape`, ...) are notebook
# cell echoes; they have no effect when run as a plain script.
# Load event log, test log, and ground-truth anomaly labels.
event_df, test_df, df_truth = load_data(data='PDC2020')
trace_df = pd.DataFrame(index= event_df['trace_id'].unique())
log = event_df
cols,outcome='activity',False
# Preprocess: categorify the activity column; target is the next activity
# (outcome=False), with a per-trace train/validation split.
o=PPObj(log,procs=Categorify(),cat_names=cols,y_names=cols,splits=split_traces(log))
# NOTE(review): partial(windows_fast) with no bound args is just windows_fast.
dls=o.get_dls(outcome=outcome,windows=partial(windows_fast))
m=Camargo_specialized_bottleneck(o)
train_validate(dls,m,epoch=10)
# Re-preprocess the full log without splits to score every window.
o=PPObj(log,procs=Categorify(),cat_names=cols,y_names=cols)
o.items
wds,idx=windows_fast(o.xs, o.event_ids)
y = o.items['activity'].iloc[idx].values
# Score all windows in one batch -- assumes everything fits on the GPU.
res=(m(LongTensor(wds.squeeze()).cuda()))
wds
len(res), len(wds)
# Align event_df with the model outputs: each trace's first event has no
# preceding window, so drop it, then index by trace id.
event_df.reset_index(drop=True, inplace=True)
event_df.drop(event_df[event_df.event_id == 0].index, inplace=True)
event_df.index = event_df['trace_id']
event_df
res.shape
y.shape
len(o.items.index)
# Evaluate predictions against the traces labelled anomalous in the truth set.
a = AnomalyDetection(res, y, event_df, df_truth.loc[df_truth['normal']== False],binet=True)
a(threshold='gmean', analyze=True,s=20)
# Global store the forward hook writes captured layer outputs into.
activation = {}


def get_activation(name):
    """Return a forward hook that records the layer's detached output under *name*."""
    def hook(module, inputs, output):
        activation[name] = output.detach()
    return hook
# --- Visualize and cluster the 2-d bottleneck activations ------------------
# Capture the bottleneck output for every window via a forward hook, then
# re-run the model so the hook fires.
m.l_bottleneck.register_forward_hook(get_activation('self.l_bottleneck'))
output = (m(LongTensor(wds.squeeze()).cuda()))
import plotly.graph_objects as go
x1 = activation['self.l_bottleneck'][:,0].cpu()
y1 = activation['self.l_bottleneck'][:,1].cpu()
# Scatter plot of the bottleneck space (Scattergl copes with many points).
fig = go.Figure(data=go.Scattergl(
x = x1,
y = y1,
mode='markers',
))
fig.show()
# Cluster the bottleneck activations; DBSCAN labels noise points as -1.
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=0.3, min_samples=10).fit(activation['self.l_bottleneck'].cpu())
unique_labels= set(clustering.labels_)
# NOTE(review): `plt` and `matplotlib` are presumably provided by the star
# imports at the top of the file -- verify.
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
color_indices = clustering.labels_
colormap = matplotlib.colors.ListedColormap(colors)
colormap.colors[0] = (0.,0.,0.,1.)
import plotly.express as px
# Re-plot, colouring each point by its DBSCAN cluster label.
df_plot = pd.DataFrame({'x': x1, 'y': y1, 'c': color_indices})
df_plot["c"] = df_plot["c"].astype(str)
fig = px.scatter(df_plot, x="x", y="y", color="c")
fig.show()
# Treat DBSCAN noise points (label -1) as anomalous windows, map them back to
# trace ids, and compare against the ground-truth anomalous cases.
anomalies = np.where(clustering.labels_ == -1)[0].tolist()
len(anomalies)
anomalies = event_df.iloc[anomalies]['trace_id'].unique()
len(anomalies)
truth = df_truth.loc[df_truth['normal']==False]['case'].unique()
len(set(truth).intersection(set(anomalies)))
f1score(truth, anomalies)